package getGaokaoXuankeData.controller;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CatchWord {
    private static  String global_url = "http://10.6.150.162:8020/";
    private static int current_num = 1;//读取Excel中的值
    //private static String excel_path01 = "G:\\desk temp\\202006\\pages\\西语大纲词汇-整理.xls";
    private static String excel_path01 = "F:\\file\\202006\\西语大纲词汇-整理.xls";
    //新建四个list
    private static List<HashMap<String,Object>> chouxian_a = new ArrayList<HashMap<String,Object>>();//抽象A1、A2
    private static List<HashMap<String,Object>> chouxiang_b = new ArrayList<HashMap<String,Object>>();//抽象B1、B2
    private static List<HashMap<String,Object>> juti_a = new ArrayList<HashMap<String,Object>>();//具体A1、A2
    private static List<HashMap<String,Object>> juti_b = new ArrayList<HashMap<String,Object>>();//具体B1、B2

    //入口
    public static void main(String[] args) throws IOException {
        //getHtmlVal();
        //getExcelVal();
        //searchWord();
    }




    //获取网页内容
    public static void getHtmlVal() throws IOException {
        String[] urls = {
                global_url+"pages/chouxiang-A1-A2..html"//抽象A
                ,global_url+"pages/chouxiang-B1-B2..html"//抽象B
                ,global_url+"pages/juti-A1-A2..html"//具体A
                ,global_url+"pages/juti-B1-B2..html"//具体B
                };
        for (int i=0;i<1;i++){
            System.out.println("页面"+i+"：");
            Document document = Jsoup.connect(urls[i]).get();//获取对应url的值
            //System.out.println(document);
            //Elements elements = document.select("table[bgcolor=#E4E4E4]").select("tr");
            Elements elements = document.select("table");//获取table
            Integer table_len = elements.size();//获取table的数量
            for (int j=0;j<table_len;j++){
                HashMap<String,Object> map = new HashMap<String,Object>();
                //获取caption标签对应的值
                Elements elements_title = elements.get(j).select("caption");
                //将值转换为text
                String str_title = elements_title.text();
                //去掉数字
                boolean num_b = isContainNumber(str_title);//判断是否包含数字
                if (num_b)
                    str_title = str_title.replaceAll("[\\d.]","");
                //去掉链接内容
                boolean zkh_b = isContain_zkh(str_title);
                if (zkh_b){
                    int n = str_title.indexOf("[");
                    str_title = str_title.substring(0,n);
                }
                map.put("title",str_title);//添加标题
                //获取分类
                Elements elements_fenlei = elements.get(j).select("thead").select("th");
                //分类1
                map.put("fenlei1",elements_fenlei.get(0).text());
                //获取单词
                Elements elements_word = elements.get(j).select("tbody").select("tr");

                //获取第一列的单词,并去除a标签的值
                String word_val = Jsoup.clean(elements_word.html(),new Whitelist().addTags(new String[]{ "ul","li","h4","p"}));
                String all_word = Jsoup.parse(word_val).text();
                String regEx = getExcelVal();
                searchWord(regEx,all_word);
                //System.out.println(Jsoup.parse(word_val).text());

                //获取第二列的单词，并去除a标签的值


            }
            //System.out.println("len:"+elements.get(0).select());
        }

    }
    //查询,regEx为待匹配的单词，all_word为待遍历的文本，String regEx,String all_word
    public static boolean searchWord(String regEx,String all_word){
        //String regEx ="shop00";
        //检测要匹配的单词，单词的前面必须是空格或者标点符号
        Pattern pattern = Pattern.compile("[^\\w]"+regEx+"[^\\w]");
        //Matcher matcher = pattern.matcher("yesterday, I entered  a shop and bought a pencil.");
        Matcher matcher = pattern.matcher(all_word);
        boolean result= matcher.find();
        System.out.println(result);
        return result;
    }
    //读取Excel中的值
    private static String getExcelVal() throws IOException {
        File excel = new File(excel_path01);
        //HSSFWorkbook hssfWorkbook = HSSFWorkbookFactory.createWorkbook(excel);
        Workbook workbook = WorkbookFactory.create(excel);
        Sheet sheet1 = workbook.getSheetAt(0);
        Row row1 = sheet1.getRow(current_num);
        System.out.println(row1.getCell(0));
        current_num ++;
        return row1.getCell(0).toString();
    }
    //判断是否包含数字
    public static boolean isContainNumber(String company) {
        Pattern p = Pattern.compile("[0-9]");
        Matcher m = p.matcher(company);
        if (m.find()) {
            return true;
        }
        return false;
    }

    //判断是否包含“[”字符
    private static  boolean isContain_zkh(String str){
        if (str.contains("["))
            return true;
        else
            return false;
    }

}
